import matplotlib
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
plt.style.use('seaborn-whitegrid')
font = {'family' : 'Times',
'weight' : 'bold',
'size' : 12}
matplotlib.rc('font', **font)
# Aux functions
def plot_grid(Xs, Ys, axs=None):
''' Aux function to plot a grid'''
t = np.arange(Xs.size) # define progression of int for indexing colormap
if axs:
axs.plot(0, 0, marker='*', color='r', linestyle='none') #plot origin
axs.scatter(Xs,Ys, c=t, cmap='jet', marker='.') # scatter x vs y
axs.axis('scaled') # axis scaled
else:
plt.plot(0, 0, marker='*', color='r', linestyle='none') #plot origin
plt.scatter(Xs,Ys, c=t, cmap='jet', marker='.') # scatter x vs y
plt.axis('scaled') # axis scaled
def linear_map(A, Xs, Ys):
'''Map src points with A'''
# [NxN,NxN] -> NxNx2 # add 3-rd axis, like adding another layer
src = np.stack((Xs,Ys), axis=Xs.ndim)
# flatten first two dimension
# (NN)x2
src_r = src.reshape(-1,src.shape[-1]) #ask reshape to keep last dimension and adjust the rest
# 2x2 @ 2x(NN)
dst = A @ src_r.T # 2xNN
#(NN)x2 and then reshape as NxNx2
dst = (dst.T).reshape(src.shape)
# Access X and Y
return dst[...,0], dst[...,1]
def plot_points(ax, Xs, Ys, col='red', unit=None, linestyle='solid'):
'''Plots points'''
ax.set_aspect('equal')
ax.grid(True, which='both')
ax.axhline(y=0, color='gray', linestyle="--")
ax.axvline(x=0, color='gray', linestyle="--")
ax.plot(Xs, Ys, color=col)
if unit is None:
plotVectors(ax, [[0,1],[1,0]], ['gray']*2, alpha=1, linestyle=linestyle)
else:
plotVectors(ax, unit, [col]*2, alpha=1, linestyle=linestyle)
def plotVectors(ax, vecs, cols, alpha=1, linestyle='solid'):
'''Plot set of vectors.'''
for i in range(len(vecs)):
x = np.concatenate([[0,0], vecs[i]])
ax.quiver([x[0]],
[x[1]],
[x[2]],
[x[3]],
angles='xy', scale_units='xy', scale=1, color=cols[i],
alpha=alpha, linestyle=linestyle, linewidth=2)
On next Thursday 11 April 2024 at 2pm we will do a self-assessment test to evaluate yourself on this first part of the program.
Please fill this form https://forms.gle/zsptFtQ6cdQQ1WA89 by Wed. 10 April @ 3pm
Please bring:
Self-Assessment will cover the first part on:
Objective and Motivation: The goal of unsupervised learning is to find hidden patterns in unlabeled data.
\begin{equation} \underbrace{\{\mathbf{x}_i\}_{i=1}^N}_{\text{known}} \sim \underbrace{\mathcal{D}}_{\text{unknown}} \end{equation}Unlike in supervised learning, any data points is not paired with a label.
As you can see the unsupervised learning problem is ill-posed (which hidden patterns?) and in principle more difficult than supervised learning.
Unsupervised learning can be thought of as "finding structure" in the data.
from sklearn.datasets import make_blobs
X, y = make_blobs(n_samples=150,
n_features=2,
centers=3,
cluster_std=0.3,
shuffle=True,
random_state=0)
plt.scatter(X[:, 0], X[:, 1], c='black', marker='o', s=50)
plt.show()
findfont: Font family ['Times'] not found. Falling back to DejaVu Sans.
plt.scatter(X[:, 0], X[:, 1], c=y, marker='o', s=50, cmap='jet')
plt.show()
from sklearn.datasets import make_blobs
X, y = make_blobs(n_samples=150,
n_features=2,
centers=3,
cluster_std=0.5,
shuffle=True,
random_state=0)
from sklearn.datasets import make_blobs
X2, y2 = make_blobs(n_samples=150,
n_features=2,
centers=3,
cluster_std=.7,
shuffle=True,
random_state=0)
plt.scatter(X2[:, 0], X2[:, 1], c='black', marker='o', s=50)
plt.show()
data points 3 maps to the cluster centroid 2plt.scatter(X[:, 0], X[:, 1], c='black', marker='o', s=50)
plt.show()
from sklearn.cluster import KMeans
km = KMeans(n_clusters=3,
init='random',
n_init=10,
max_iter=300,
tol=1e-04,
random_state=0)
y_km = km.fit_predict(X)
plt.scatter(X[y_km == 0, 0],
X[y_km == 0, 1],
s=50,
c='lightgreen',
marker='s',
label='cluster 1')
plt.scatter(X[y_km == 1, 0],
X[y_km == 1, 1],
s=50,
c='orange',
marker='o',
label='cluster 2')
plt.scatter(X[y_km == 2, 0],
X[y_km == 2, 1],
s=50,
c='lightblue',
marker='v',
label='cluster 3')
plt.scatter(km.cluster_centers_[:, 0],
km.cluster_centers_[:, 1],
s=250,
marker='*',
c='red',
label='centroids')
plt.legend()
plt.tight_layout()
plt.show()
C:\Users\wangf\anaconda3\lib\site-packages\sklearn\cluster\_kmeans.py:1382: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1. warnings.warn(
Remember that we have to find:
plt.scatter(X[:, 0], X[:, 1], c='black', marker='o', s=50)
plt.show()
Initialization: Random sample the $K$ centroids $\{ \mbf{\mu}_1, \ldots, \mbf{\mu}_k\}$
A trick is using available points (just choose a few of them)
For every point $i=[1,\ldots,N]$ compute the assignment wrt to the $K$ clusters.
#data points x #clusters)Given a point $\mbf{x}_i$, we have to find the label of the "closest" centroids in $\{ \mbf{\mu}_1, \ldots, \mbf{\mu}_k\}$
At the end of this step we should have a pair of aligned set $\{ \mbf{x}_1, \ldots, \mbf{x_n}\}$ and $\{ {y}_1, \ldots, {y_n}\}$
Given a $\mbf{\mu}_k$, we update it as $\mbf{\mu}_k \leftarrow \frac{\sum_i \delta\{y_i=k\}\mbf{x}_i}{\sum_i \delta\{y_i=k\}}$
It is a formalism to express the concept of:
Final result

What does "convergence" mean?
We start from this {{print(X.shape[0])}} points in {{print(X.shape[1])}}-D space
{{ plt.scatter(X[:, 0], X[:, 1], c='black', marker='o', s=50); plt.show() }}
def init(X, k):
N, dim = X.shape
# we select k samples from the points we have randomly
idx = np.random.choice(N, k, replace=False)
# the centers are just the k-random sampled points
return X[idx]
def get_assignments(X, centers):
P = X[:, np.newaxis, :] # Nx1xD
diff_sq = (P - centers)**2 # Nx1xD - KxD -> NxKxD # broadcasting
# more info on broadcasting https://numpy.org/doc/stable/user/basics.broadcasting.html
# dist = sum_over_axis(diff^2) then take argmin of dist across centers
y = diff_sq.sum(axis=2).argmin(axis=1)
return y # Nx1 and the values inside go from [1...K]
def update_centers(X, y, K):
# select the points to the k center and remove N dimension with mean over axis 0
y_row = y.reshape((1, -1)) #1xN
k_col = np.arange(K).reshape(-1, 1) #Kx1
#M[i][j] = 1 if X[j,:] belongs to cluster i else 0
M = k_col == y_row #KxN
#cluster_sum[i,:] contains the sum of all vectors in cluster i
cluster_sum = M @ X #KxN @ NxD -> KxD
#counts[i,:] contains the number of vectors in cluster i
counts = np.sum(M, axis=1).reshape(-1, 1) #Kx1
return cluster_sum / counts #KxD
# Double check
np.random.seed(0) # make the method deterministic
from sklearn.metrics import pairwise_distances_argmin
centers = init(X, 3)
y_assignments = get_assignments(X, centers)
labels = pairwise_distances_argmin(X, centers)
assert np.array_equal(labels, y_assignments), 'Assignment not correct!'
def kmeans(X, n_clusters):
np.random.seed(0) # make the method deterministic
iter = 1
#1. init random sample the centers
centers = init(X, n_clusters)
while True:
# 2. Assignment step
y_assignments = get_assignments(X, centers)
# 3. Update step
new_centers = update_centers(X, y_assignments, n_clusters)
# 4. Check for convergence
# This is a possible implementation
# A more formal is to check that the loss is plateauing (more on this later)
if np.allclose(centers, new_centers):
break
# update cluster for next iter
centers = new_centers
print(f'Iteration {iter}')
iter += 1
return centers, y_assignments
centers, y_est = kmeans(X, 3)
plt.scatter(*X.T, c=y_est,
s=50, cmap='viridis')
_ = plt.scatter(*centers.T,
s=250,
marker='*',
c='red',
label='centroids')
Iteration 1 Iteration 2 Iteration 3
from sklearn.datasets import make_blobs
X, y = make_blobs(n_samples=5000,
n_features=3,
centers=25,
cluster_std=1.25,
shuffle=True,
random_state=0)
fig = plt.figure(figsize=(15,15))
ax = fig.add_subplot(projection='3d')
ax.scatter(*X.T, c=y,
s=50, cmap='jet')
ax.set_xlabel('X Label')
ax.set_ylabel('Y Label')
_ = ax.set_zlabel('Z Label')
findfont: Font family ['Times'] not found. Falling back to DejaVu Sans.
centers, y_est = kmeans(X, 20)
Iteration 1 Iteration 2 Iteration 3 Iteration 4 Iteration 5 Iteration 6 Iteration 7 Iteration 8 Iteration 9 Iteration 10 Iteration 11 Iteration 12 Iteration 13 Iteration 14 Iteration 15 Iteration 16 Iteration 17 Iteration 18 Iteration 19 Iteration 20 Iteration 21 Iteration 22 Iteration 23 Iteration 24 Iteration 25 Iteration 26 Iteration 27 Iteration 28 Iteration 29 Iteration 30 Iteration 31 Iteration 32
fig = plt.figure(figsize=(15,15))
ax = fig.add_subplot(projection='3d')
ax.scatter(*X.T, c=y_est,
s=10, cmap='viridis')
ax.set_xlabel('X Label')
ax.set_ylabel('Y Label')
_ = ax.set_zlabel('Z Label')
_ = ax.scatter(*centers.T,
s=250,
marker='*',
c='red',
label='centroids')
from sklearn.datasets import make_blobs
from sklearn.metrics import pairwise_distances_argmin
X, y_true = make_blobs(n_samples=300, centers=4,
cluster_std=0.60, random_state=0)
rng = np.random.RandomState(42)
centers = [0, 4] + rng.randn(4, 2)
def draw_points(ax, c, factor=1):
ax.scatter(X[:, 0], X[:, 1], c=c, cmap='viridis',
s=50 * factor, alpha=0.3)
def draw_centers(ax, centers, factor=1, alpha=1.0):
ax.scatter(centers[:, 0], centers[:, 1],
c=np.arange(4), cmap='viridis', s=200 * factor,
alpha=alpha)
ax.scatter(centers[:, 0], centers[:, 1],
c='black', s=50 * factor, alpha=alpha)
def make_ax(fig, gs):
ax = fig.add_subplot(gs)
ax.xaxis.set_major_formatter(plt.NullFormatter())
ax.yaxis.set_major_formatter(plt.NullFormatter())
return ax
fig = plt.figure(figsize=(15, 4))
gs = plt.GridSpec(4, 15, left=0.02, right=0.98, bottom=0.05, top=0.95, wspace=0.2, hspace=0.2)
ax0 = make_ax(fig, gs[:4, :4])
ax0.text(0.98, 0.98, "Random Initialization", transform=ax0.transAxes,
ha='right', va='top', size=16)
draw_points(ax0, 'gray', factor=2)
draw_centers(ax0, centers, factor=2)
for i in range(3):
ax1 = make_ax(fig, gs[:2, 4 + 2 * i:6 + 2 * i])
ax2 = make_ax(fig, gs[2:, 5 + 2 * i:7 + 2 * i])
# E-step
y_pred = pairwise_distances_argmin(X, centers)
draw_points(ax1, y_pred)
draw_centers(ax1, centers)
# M-step
new_centers = np.array([X[y_pred == i].mean(0) for i in range(4)])
draw_points(ax2, y_pred)
draw_centers(ax2, centers, alpha=0.3)
draw_centers(ax2, new_centers)
for i in range(4):
ax2.annotate('', new_centers[i], centers[i],
arrowprops=dict(arrowstyle='->', linewidth=1))
# Finish iteration
centers = new_centers
ax1.text(0.95, 0.95, "E-Step", transform=ax1.transAxes, ha='right', va='top', size=14)
ax2.text(0.95, 0.95, "M-Step", transform=ax2.transAxes, ha='right', va='top', size=14)
# Final E-step
y_pred = pairwise_distances_argmin(X, centers)
axf = make_ax(fig, gs[:4, -4:])
draw_points(axf, y_pred, factor=2)
draw_centers(axf, centers, factor=2)
axf.text(0.98, 0.98, "Final Clustering", transform=axf.transAxes,
ha='right', va='top', size=16)
fig.savefig('figs/05.11-expectation-maximization.png')
findfont: Font family ['Times'] not found. Falling back to DejaVu Sans. findfont: Font family ['Times'] not found. Falling back to DejaVu Sans.

from sklearn.datasets import make_moons
from sklearn.cluster import KMeans
X, y = make_moons(200, noise=.05, random_state=0)
labels = KMeans(2, random_state=0).fit_predict(X)
plt.scatter(X[:, 0], X[:, 1], c=labels,
s=50, cmap='viridis');
C:\Users\wangf\anaconda3\lib\site-packages\sklearn\cluster\_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning warnings.warn( C:\Users\wangf\anaconda3\lib\site-packages\sklearn\cluster\_kmeans.py:1382: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1. warnings.warn(

Given a set of points $P_k$ and a distance metric $d(x,y) = \ell_2(x-y)$ and $d(x,\, A) = \inf\{d(x,\, a) \mid a \in A\}$
$$ R_k = \{x \in X \mid d(x, P_k) \leq d(x, P_j)\; \text{for all}\; j \neq k\} $$

Similarly as for PCA, we can define a loss or cost function that can better formalize the K-means algorithm.
$$ \mathcal{L}(\mbf{\mu},y;\mbf{D}) = \sum_{i=1}^N \left| \left| \mbf{x}_i - \mbf{\mu}_{y_i} \right|\right|_2^2 $$Similarly as for PCA, we can define a loss or cost function that can better formalize the K-means algorithm.
$$ \mathcal{L}(\mbf{\mu},y;\mbf{D}) = \sum_{i=1}^N \left| \left| \mbf{x}_i - \mbf{\mu}_{y_i} \right|\right|_2^2 = \sum_{i=1}^N \min_{k \in K} \Big(\left| \left| \mbf{x}_i - \mbf{\mu}_{k} \right|\right|_2^2 \Big) $$Yes, we are guarantee that K-means will always converge yet to a LOCAL Optimum
For any dataset D and any number of clusters K, the K-means algorithm converges in a finite number of iterations, where convergence is measured by the loss ceasing the change
For any dataset $\mbf{D}$ and any number of clusters $K$, the K-means algorithm converges in a finite number of iterations, where convergence is measured by the loss $\mathcal{L}$ ceasing the change.
There are only two points in which the K-means algorithm changes its values.
We will show that both of these operations can never increase the value of $\mathcal{L}$. Assuming this is true, the rest of the argument is as follows.
It remains to show that the two steps can only decrease the loss.
Looking at a point $i$ suppose that the previous value of $y_i$ is $a$ and the new value is $b$. It must be the case that $|| \mbf{x}_i -\mbf{\mu}_b ||_2^2 \leq || \mbf{x}_i -\mbf{\mu}_a ||_2^2 $. Thus, changing from $a \mapsto b$ can only decrease the loss.
Consider the second form of the loss. The update computes $\mbf{\mu}_k$ as the mean of the data for which $y_i=k$ which is precisely the point to minimize squared distance. So, This step too can only decrease the loss.
Though convergence is guaranteed, it does NOT say it is fast (i.e. it may be slow to converge for large datasets).
There are techniques such as mini-batch K-means to speed up convergence.
Practical implications: It will converge but to get a good results you may need either:
A

Which one of these configuration is best? A or B?
B

Idea: Semi-random initialization by picking initial means as far from each other as possible
Choose $\mbf{\mu}_1$ arbitrarily between data points
Choose $\mbf{\mu}_1$ arbitrarily between data points
You want the point that’s as far away from all previous means as possible
K-mean++ paper idea is pretty "young" (from 2007)
max() across points when selecting the centroidsrandomness(K-means++) > randomness(Kmeans + Furthest-first) because with Furthest-first we select at random only at the start; then all the rest is deterministic. Kmeans++ at each cluster selection there is stochasticity involveddist = np.array([1.18518298, 1.30917493, 1.10973212, 2.24523519, 1.01625606])
pmf = dist/dist.sum()
dist = {{dist = np.array([1.18518298, 1.30917493, 1.10973212, 2.24523519, 1.01625606]);print(dist)}}
to {{pmf = dist/dist.sum()}}
pmf = {{print(pmf)}} simply with
pmf = dist/dist.sum()
fig, axs = plt.subplots(1, 2)
fig.set_figheight(3)
fig.set_figwidth(15)
# PDF
axs[0].stem(pmf, linefmt='b-', markerfmt='bo', basefmt='--')
axs[0].set_title('PMF')
axs[0].set_xlabel('Index of distance')
axs[0].set_ylabel('Probability')
axs[0].set_aspect('auto')
# CUMSUM
axs[1].plot(np.insert(pmf.cumsum(),0,0), 'o--')
axs[1].set_title('CMF')
axs[1].set_xlabel('Index of distance')
axs[1].set_ylabel('Cumlative Probability')
axs[1].set_aspect('auto')
plt.show()
findfont: Font family ['Times'] not found. Falling back to DejaVu Sans.
fig, axs = plt.subplots(1, 2)
fig.set_figheight(5)
fig.set_figwidth(20)
# PDF
axs[0].stem(pmf, linefmt='b-', markerfmt='bo', basefmt='--')
axs[0].set_title('PMF')
axs[0].set_xlabel('Index of distance')
axs[0].set_ylabel('Probability')
axs[0].set_aspect('auto')
# CUMSUM
axs[1].plot(np.insert(pmf.cumsum(),0,0), 'o--')
axs[1].set_title('CMF')
axs[1].set_xlabel('Index of distance')
axs[1].set_ylabel('Cumlative Probability')
axs[1].set_aspect('auto')
#axs[1].stem(pmf, linefmt='b-', markerfmt='bo', basefmt='--')
plt.show()
U = np.random.rand(1000) #random sample uniformly from [0, 1].
# u ~ U[0,1]
# for each sampled u.
# find the first bin for which u > cumsum.
# Get the index of that bin as the sample.
sampled_idx = np.argmin((U[:, None] > pmf.cumsum()), axis=1)
import random
random.choices(range(0,5), weights=pmf, k=1000) #support pmf, pmf, k=how many to sample
pmf.cumsum()
array([0.17262675, 0.36331343, 0.52495046, 0.85197815, 1. ])
U = np.array([0.13,0.25])
U[:, None] > pmf.cumsum()
array([[False, False, False, False, False],
[ True, False, False, False, False]])
np.argmin(U[:, None] > pmf.cumsum(), axis=1)
array([0, 1], dtype=int64)
U = np.random.rand(5)
sampled_idx = np.argmin((U[:, None] > pmf.cumsum()), axis=1)
_=plt.bar(*np.unique(sampled_idx,return_counts=True))
U = np.random.rand(10000)
sampled_idx = np.argmin((U[:, None] > pmf.cumsum()), axis=1)
#_ = plt.hist(sampled_idx, histtype='bar')
_=plt.bar(*np.unique(sampled_idx,return_counts=True))
k that gives large gap between k-1 means and k-means cost function.it is OK for the loss to go down but do not make the model (number of K) too complex.from sklearn.datasets import make_blobs
N_centers = 25
X, y = make_blobs(n_samples=1500,
n_features=2,
centers=N_centers,
cluster_std=.42,
shuffle=True,
random_state=0)
plt.scatter(X[:, 0], X[:, 1], c='black', marker='o', s=50)
plt.show()
def inverse_sampling(pmf, n_samples):
U = np.random.rand(n_samples)
sampled_idx = np.argmin((U[:, None] > pmf.cumsum()), axis=1)
return sampled_idx
def distance(X, centers):
P = X[:, np.newaxis, :] # Nx1xD
diff_sq = (P - centers)**2 # Nx1xD - KxD -> NxKxD # broadcasting
# more info on broadcasting https://numpy.org/doc/stable/user/basics.broadcasting.html
# dist = sum_over_axis(diff^2) then take argmin of dist across centers
diff_sq_sum = diff_sq.sum(axis=2)
return diff_sq_sum
def init(X, K, plusplus=True):
N, dim = X.shape
if not plusplus:
print('> Init with random sampling')
# we select k samples from the points we have randomly
idx = np.random.choice(N, K, replace=False)
# the centers are just the k-random sampled points
return X[idx]
else:
print('> Init with kmeans++')
# we select a point
idx_first = np.random.choice(N, 1)
# init the centers and distances
centers = X[idx_first]
dists = distance(X, centers)
for k in range(1, K): # for [1.....K-1]
# compute min across all k-centers, for all points
# dists is NxK so min_dist is Nx1
min_dist = dists.min(axis=1)
# select one of the N proportional to higher distance
idx = inverse_sampling(min_dist/min_dist.sum(), 1)
new_center = X[idx]
# compute distance between all points to new center
dist_new = distance(X, new_center)
# update pool of centers and distances
centers = np.vstack([centers, new_center])
dists = np.hstack([dists, dist_new])
# the centers are computed after the loop
return centers
def get_assignments(X, centers):
dist_sq = distance(X, centers) # get distance #Nxk
y = dist_sq.argmin(axis=1) # select argmin distance #Nx1
return y, dist_sq.min(axis=1) # Nx1 and the values inside go from [1...K]
def update_centers(X, y, K):
# select the points to the k center and remove N dimension with mean over axis 0
y_row = y.reshape((1, -1)) #1xN
k_col = np.arange(K).reshape(-1, 1) #Kx1
#M[i][j] = 1 if X[j,:] belongs to cluster i else 0
M = k_col == y_row #KxN
#cluster_sum[i,:] contains the sum of all vectors in cluster i
cluster_sum = M @ X #KxN @ NxD -> KxD
#counts[i,:] contains the number of vectors in cluster i
counts = np.sum(M, axis=1).reshape(-1, 1) #Kx1
return cluster_sum / counts #KxD
# Double check
np.random.seed(0) # make the method deterministic
from sklearn.metrics import pairwise_distances_argmin
centers = init(X, N_centers)
y_assignments, _ = get_assignments(X, centers)
labels = pairwise_distances_argmin(X, centers)
assert np.array_equal(labels, y_assignments), 'Assignment not correct!'
> Init with kmeans++
def kmeans(X, n_clusters):
np.random.seed(0) # make the method deterministic
iter = 1
# 1. init random sample the centers
centers = init(X, n_clusters)
loss = []
while True:
# 2. Assignment step
y_assignments, dist_sq = get_assignments(X, centers)
# distance between points and center, average over points
loss.append(dist_sq.mean())
# 3. Update step
new_centers = update_centers(X, y_assignments, n_clusters)
# 4. Check for convergence
# If two last losses are close enough we stop
if len(loss) > 2 and np.allclose(loss[-2], loss[-1], atol=1e-5):
break
# update cluster for next iter
centers = new_centers
print(f'> Iteration {iter}: loss {loss[-1]}')
iter += 1
return centers, y_assignments, loss
centers, y_est, loss = kmeans(X, N_centers)
> Init with kmeans++ > Iteration 1: loss 1.7296452170971994 > Iteration 2: loss 0.8938248433703359 > Iteration 3: loss 0.7897878328750901 > Iteration 4: loss 0.7802986159291134 > Iteration 5: loss 0.779175352001348 > Iteration 6: loss 0.778229997754058 > Iteration 7: loss 0.777968597663401 > Iteration 8: loss 0.7778315053744624
fig, ax = plt.subplots(1, 3, figsize=(20, 5))
ax[0].scatter(*X.T, c=y_est,
s=50, cmap='viridis')
ax[0].set_title('Result')
_ = ax[0].scatter(*centers.T,
s=250,
marker='*',
c='red',
label='centroids')
_ = ax[1].plot(loss)
_ = ax[1].set_title('Loss function')
_ = ax[2].scatter(*X.T, c=y,
s=50, cmap='viridis')
_ = ax[2].set_title('Expected')
The applications are inspired from this website
from sklearn.datasets import load_digits
digits = load_digits()
digits.data.shape
(1797, 64)
| Attribute | Value |
|---|---|
| Classes | 10 |
| Samples per class | ~180 |
| Samples total | 1797 |
| Dimensionality | 64 |
| Features | int 0-15 |
_ = plt.hist(digits.data.flatten())
np.random.seed(13) # fixing the seed to see same pictures
fig, ax = plt.subplots(2, 5, figsize=(8, 3))
idx = np.random.choice(digits.data.shape[0], 10)
labels = digits.target[idx, ...]
points = digits.data[idx, ...].reshape(-1, 8, 8)
for axi, vector, y in zip(ax.flat, points, labels):
axi.set(xticks=[], yticks=[])
axi.imshow(vector, interpolation='none', cmap='gray')
axi.set_title(y)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(digits.data,digits.target, test_size=0.1,
random_state=0, stratify=digits.target)
plt.bar(*np.unique(y_train, return_counts=True));
plt.bar(*np.unique(y_test, return_counts=True));
Informally, the mode is the most frequent element
u, counts = np.unique([1,2,3,4,1,1,1,], return_counts=True)
mode = u[counts.argmax()] # will print 1 since 1 is most frequent
# 4 time present
N_centers = 10 # we assume to know that there are 10 digits
centers, y_est, loss = kmeans(X_train, N_centers)
> Init with kmeans++ > Iteration 1: loss 1314.5627705627705 > Iteration 2: loss 756.8259891509574 > Iteration 3: loss 725.4550366992323 > Iteration 4: loss 716.4499767121214 > Iteration 5: loss 711.2049934919781 > Iteration 6: loss 708.236358647983 > Iteration 7: loss 705.2994723513523 > Iteration 8: loss 701.9520972059448 > Iteration 9: loss 698.6658028658622 > Iteration 10: loss 695.5144766870222 > Iteration 11: loss 693.5266489678621 > Iteration 12: loss 692.1208906914004 > Iteration 13: loss 691.3495761746228 > Iteration 14: loss 690.657571369928 > Iteration 15: loss 690.0861205462734 > Iteration 16: loss 689.7669567135049 > Iteration 17: loss 689.6771929699328 > Iteration 18: loss 689.6572694423297 > Iteration 19: loss 689.6474580308259 > Iteration 20: loss 689.6267235993998 > Iteration 21: loss 689.6114085109643
fig, ax = plt.subplots(2, 5, figsize=(8, 3))
centers_img = centers.reshape(10, 8, 8)
for axi, center in zip(ax.flat, centers_img):
axi.set(xticks=[], yticks=[])
axi.imshow(center, interpolation='none', cmap='gray')
KMeans is able to find clusters whose centers are recognizable digits, with perhaps the exception of 1 and 8.def get_mode(X):
values, counts = np.unique(X, return_counts=True)
return values[counts.argmax()]
# Nx1
labels = np.zeros_like(y_est)
# 10x1
center_label = np.zeros((N_centers), dtype=np.int32)
for i in range(N_centers):
mask = (y_est == i)
mode_y = get_mode(y_train[mask])
labels[mask] = mode_y
center_label[i] = mode_y
from sklearn.metrics import accuracy_score
accuracy_score(y_train, labels)
0.7043908472479901
# sum over axis of dimensionality of squared difference
# Nx1x64 vs 10x64 --> Nx10x64 --sum axis 2 ---> Nx10
# we match the test wrt to the center (there might be other classification methods)
L2_square = ((X_test[...,np.newaxis,:] - centers)**2).sum(axis=2)
# Nx10 -- argmin over axis --> Nx1
idx_cluster = L2_square.argmin(axis=1)
y_test_estimated = center_label[idx_cluster]
accuracy_score(y_test, y_test_estimated)
0.6722222222222223
digits is an easy dataset. Which means distance in the pixel-space contains useful informations.It is OK to implement things manually in a course but remember that you can use optimized implementation via sklearn.
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=4)
kmeans.fit(X)
y_kmeans = kmeans.predict(X)
t-SNE. Unlike PCA this method supports, multiple non-linear structures so you should see the digits more separated in 2D. For this use sklearn!¶# Note: this requires the ``pillow`` package to be installed
from sklearn.datasets import load_sample_image
china = load_sample_image("china.jpg")
fig, ax = plt.subplots(1, 1, figsize=(20, 15))
ax.set(xticks=[], yticks=[])
ax.imshow(china)
<matplotlib.image.AxesImage at 0x18a501596d0>
# Note: this requires the ``pillow`` package to be installed
from sklearn.datasets import load_sample_image
china = load_sample_image("china.jpg")
fig, ax = plt.subplots(1, 1, figsize=(20, 15))
ax.set(xticks=[], yticks=[])
ax.imshow(china)
<matplotlib.image.AxesImage at 0x18a5019d0d0>
WxHx3 where 3 is RGB but can also be seen as a 3D point cloud.data = china/255.0 # scale all points to 0...1
data = data.reshape(-1,3)
data = china/255.0 # scale all points to 0...1
data = data.reshape(-1,3)
data.shape
(273280, 3)
fig = plt.figure(figsize=(15,15))
ax = fig.add_subplot(projection='3d')
data_sampled = data[::10] #1 point every 10 to go faster
ax.scatter(*data_sampled.T, marker='.', color=data_sampled)
ax.set_xlabel('R')
ax.set_ylabel('G')
_ = ax.set_zlabel('B')
ax.view_init(elev=10, azim=(0))
from sklearn.cluster import KMeans
kmeans = KMeans(16)
kmeans.fit(data)
y_est = kmeans.predict(data) # predict cluster center (project)
new_colors = kmeans.cluster_centers_[y_est] # backproject to the colors centers
C:\Users\wangf\anaconda3\lib\site-packages\sklearn\cluster\_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning warnings.warn(
fig = plt.figure(figsize=(15, 15))
ax = fig.add_subplot(projection='3d')
data_sampled = new_colors[::10] # 1 point every 10 to go faster
ax.scatter(*data_sampled.T, marker='.', color=data_sampled, s=200)
ax.set_xlabel('R')
ax.set_ylabel('G')
_ = ax.set_zlabel('B')
ax.view_init(elev=10, azim=(0))
china_recolored = new_colors.reshape(china.shape)
fig, ax = plt.subplots(1, 2, figsize=(16, 6),
subplot_kw=dict(xticks=[], yticks=[]))
fig.subplots_adjust(wspace=0.05)
ax[0].imshow(china)
ax[0].set_title('Original Image (24 bit per pixel)', size=16) # 8 bit per channel, for each pixel
ax[1].imshow(china_recolored)
ax[1].set_title('16-color Image (4 bit per pixel + 24 bit per centroid)', size=16);
findfont: Font family ['Times'] not found. Falling back to DejaVu Sans.